#!/usr/bin/env python
"""

    final_uncallable_regions.py
    [--log_file PATH]
    [--verbose]

"""

################################################################################
#
#   genomic_interval_diff
#
#
#   Copyright (c) 11/3/2010 Leo Goodstadt
#
#   Permission is hereby granted, free of charge, to any person obtaining a copy
#   of this software and associated documentation files (the "Software"), to deal
#   in the Software without restriction, including without limitation the rights
#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#   copies of the Software, and to permit persons to whom the Software is
#   furnished to do so, subject to the following conditions:
#
#   The above copyright notice and this permission notice shall be included in
#   all copies or substantial portions of the Software.
#
#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#   THE SOFTWARE.
#################################################################################

import sys, os

# add self to search path for testing
if __name__ == '__main__':
    exe_path = os.path.split(os.path.abspath(sys.argv[0]))[0]
    module_name = os.path.split(sys.argv[0])[1]
    module_name = os.path.splitext(module_name)[0];
else:
    module_name = __name__

# Use import path from <<../python_modules>>
if __name__ == '__main__':
    sys.path.append(os.path.abspath(os.path.join(exe_path,"/home/lg/python_modules")))



#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   options


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


if __name__ == '__main__':
    from optparse import OptionParser
    import StringIO

    parser = OptionParser(version="%prog 1.0", usage = "\n\n    %progs [options]")

    parser.add_option("-I", "--indels_file", dest="indels_file",
                      metavar="FILE",
                      type="string",
                      help="Name and path of file containing per strain indel genomic intervals whose "
                      "four columns must be <strain><contig><beg><end>. Assumes one-based coordinates.")
    parser.add_option("-R", "--repeats_file", dest="repeats_file",
                      metavar="FILE",
                      type="string",
                      help="Name and path of file containing simple repeats in the reference genome whose "
                      "three columns must be <contig><beg><end>. Assumes zero-based coordinates.")
    parser.add_option("-U", "--uncallable_files", dest="uncallable_files",
                      metavar="FILE_SPEC",
                      type="string",
                      help="Wildcard specifying files containing uncallable regions with poor mappability and high read depth."
                      "File names (not_including extensions must match the first column in --indels file."
                      "First three columns must be <contig><beg><end>. Assumes one-based coordinates.")
    parser.add_option("-o", "--output_directory", dest="output_directory",
                      metavar="PATH",
                      type="string",
                      help="Output directory PATH. the file names will be the same those in --uncallable_files.")

    #
    #   general options: verbosity / logging
    #
    parser.add_option("-v", "--verbose", dest = "verbose",
                      action="count", default=0,
                      help="Print more verbose messages for each additional verbose level.")
    parser.add_option("-L", "--log_file", dest="log_file",
                      metavar="FILE",
                      type="string",
                      help="Name and path of log file")
    parser.add_option("--skip_parameter_logging", dest="skip_parameter_logging",
                        action="store_true", default=False,
                        help="Do not print program parameters to log.")
    parser.add_option("--debug", dest="debug",
                        action="count", default=0,
                        help="Set default program parameters in debugging mode.")




    # get help string
    f =StringIO.StringIO()
    parser.print_help(f)
    helpstr = f.getvalue()
    original_args = " ".join(sys.argv)
    (options, remaining_args) = parser.parse_args()

    options.log_file                = os.path.join("logs/final_uncallable_regions.log")
    options.log_parameters          = True


    #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
    #                                             #
    #   Debug: Change these                       #
    #                                             #
    #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    if not options.indels_file:
        options.indels_file="/data/mus/visitors/lg/short_read_sv/all_indels.expanded_by_15.list"
    if not options.repeats_file:
        options.repeats_file="/data/mus/visitors/lg/ucsc_data/short.simple_repeats.expanded_by_5.data"
    if not options.uncallable_files:
        options.uncallable_files="/data/mus/visitors/lg/uncallable/*.nc"
    if not options.output_directory:
        options.output_directory="/data/mus/visitors/lg/uncallable_including_repeats_indels"
    options.verbose                 = 2
    #vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
    #                                             #
    #   Debug: Change these                       #
    #                                             #
    #^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    #
    #   mandatory options
    #

    mandatory_options = ["indels_file", "repeats_file", "uncallable_files", "output_directory"]
    def check_mandatory_options (options, mandatory_options, helpstr):
        """
        Check if specified mandatory options have been defined
        """
        missing_options = []
        for o in mandatory_options:
            if not getattr(options, o):
                missing_options.append("--" + o)

        if not len(missing_options):
            return

        raise Exception("Missing mandatory parameter%s: %s.\n\n%s\n\n" %
                        ("s" if len(missing_options) > 1 else "",
                         ", ".join(missing_options),
                         helpstr))
    check_mandatory_options (options, mandatory_options, helpstr)


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   imports


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


#from json import dumps
from collections import defaultdict
from intervals import t_intervals
from sorted_by_embedded_numbers import sorted_by_embedded_numbers
from glob import glob


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Functions


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#_________________________________________________________________________________________

#   load_intervals
#_________________________________________________________________________________________
def load_intervals (file_name, no_strain, one_based_inclusive):
    """
    Load data from tab delimited genomic data file inlcuding extra fields
    """
    logger.log(MESSAGE, "    Load data from %s" % file_name)
    intervals_by_contig = defaultdict(lambda:defaultdict(t_intervals))

    sys.stderr.write("        [")
    for cnt_lines, line in enumerate(open(file_name)):
        if cnt_lines % 100000 == 0:
            sys.stderr.write(".")
        if not len(line) or line[0] == '#':
            continue
        data = line.rstrip().split('\t')
        if len(data) < 3:
            logger.warning("Too few fields (<3): %s" % line)
            continue
        if not no_strain:
            strain = data.pop(0)
        else:
            strain = "NO_STRAIN"
        if len(data) < 3:
            logger.warning("Too few fields (<3): %s" % line)
            continue
        contig, beg, end = data[0:3]
        beg = int(beg)
        end = int (end)
        if one_based_inclusive:
            beg = beg - 1
        beg = beg - 1
        if contig[0:3] == 'chr':
            contig = contig[3:]
        intervals_by_contig[strain][contig].append((beg, end))
    sys.stderr.write("]\n")
    return intervals_by_contig
#_________________________________________________________________________________________

#   save_intervals
#_________________________________________________________________________________________
def save_intervals (intervals_by_contig, file_name):
    """
    Load data from tab delimited genomic data file inlcuding extra fields
    """
    logger.log(MESSAGE, "    Save intervals to %s" % file_name)
    result_file = open(file_name, "w")
    for contig in sorted_by_embedded_numbers(intervals_by_contig.keys()):
        for items in intervals_by_contig[contig].data:
            result_file.write(contig + "\t")
            #if options.one_based_inclusive:
            result_file.write(str(items[0] + 1) + "\t")
            result_file.write("\t".join(map(str, items[1:])) + "\n")

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Logger


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

if __name__ == '__main__':
    import logging
    import logging.handlers

    MESSAGE = 15
    logging.addLevelName(MESSAGE, "MESSAGE")

    def setup_std_logging (logger, log_file, verbose):
        """
        set up logging using programme options
        """
        class debug_filter(logging.Filter):
            """
            Ignore INFO messages
            """
            def filter(self, record):
                return logging.INFO != record.levelno

        class NullHandler(logging.Handler):
            """
            for when there is no logging
            """
            def emit(self, record):
                pass

        # We are interesting in all messages
        logger.setLevel(logging.DEBUG)
        has_handler = False

        # log to file if that is specified
        if log_file and len(log_file):
            handler = logging.FileHandler(log_file, delay=False)
            handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)6s - %(message)s"))
            handler.setLevel(MESSAGE)
            logger.addHandler(handler)
            has_handler = True

        # log to stderr if verbose
        if verbose:
            stderrhandler = logging.StreamHandler(sys.stderr)
            stderrhandler.setFormatter(logging.Formatter("    %(message)s"))
            stderrhandler.setLevel(logging.DEBUG)
            if log_file:
                stderrhandler.addFilter(debug_filter())
            logger.addHandler(stderrhandler)
            has_handler = True

        # no logging
        if not has_handler:
            logger.addHandler(NullHandler())


    #
    #   set up log
    #
    logger = logging.getLogger(module_name)
    setup_std_logging(logger, options.log_file, options.verbose)


    #
    #   log programme parameters
    #
    if not options.skip_parameter_logging:
        programme_name = os.path.split(sys.argv[0])[1]
        logger.info("%s %s" % (programme_name, original_args))

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Main logic


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
if __name__ == '__main__':
#   debug code not run if called as a module
#


    indels          = load_intervals (options.indels_file, False, True)
    repeats         = load_intervals (options.repeats_file, True, False)
    repeats         = repeats["NO_STRAIN"]

    logger.log(MESSAGE, "Merging indels and repeats for:")
    for strain_name in indels:
        logger.log(MESSAGE, "    %s" % (strain_name))

        #
        #   merge repeats into indels
        #
        for contig in indels[strain_name]:
            if contig in repeats:
                indels[strain_name][contig] = indels[strain_name][contig].union(repeats[contig])
        #
        #   copy repeat contigs not in indels in their entirety
        #
        for contig in repeats:
            if contig not in indels[strain_name]:
                indels[strain_name][contig] = repeats[contig]


    if not os.path.exists(options.output_directory):
        os.makedirs(options.output_directory)

    logger.log(MESSAGE, "Calculating uncallable regions per strain:")
    for file_path in glob(options.uncallable_files):
        file_name = os.path.split(file_path)[1]
        output_path = os.path.join(options.output_directory, file_name)
        strain_name = os.path.splitext(file_name)[0]
        logger.log(MESSAGE, strain_name)
        #
        #   load intervals
        #
        original = load_intervals (file_path, True, True)["NO_STRAIN"]
        logger.debug("    Calculating union...")
        for contig in original:
            if contig in indels[strain_name]:
                original[contig] = original[contig].union(indels[strain_name][contig])

        save_intervals(original, output_path)

